#didn’t like output with the flex_board screen formating so I commented out and made it html. I left it in the code so you saw that I did attempt it.
Load library
library(naniar)
library(flextable)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::compose() masks flextable::compose()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(ggplot2)
library(dplyr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
## The following objects are masked from 'package:flextable':
##
## as_image, footnote
library(ggbeeswarm)
library(skimr)
##
## Attaching package: 'skimr'
##
## The following object is masked from 'package:naniar':
##
## n_complete
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following objects are masked from 'package:flextable':
##
## highlight, style
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(boot)
#set theme to bw
theme_set(theme_bw())
Data import
# Import data
census_data <- read.csv("C:/Users/spsky/OneDrive/UoM/R/Final Project/census_data_county_2009-2021.cvs")
# first few rows of the data
head(census_data)
## geoid county_state year population median_income
## 1 1001 Autauga County, Alabama 2009 49584 51463
## 2 1003 Baldwin County, Alabama 2009 171997 48918
## 3 1005 Barbour County, Alabama 2009 29663 32537
## 4 1007 Bibb County, Alabama 2009 21464 41236
## 5 1009 Blount County, Alabama 2009 56804 45406
## 6 1011 Bullock County, Alabama 2009 10917 26209
## median_monthly_rent_cost median_monthly_home_cost prop_female prop_male
## 1 779 854 0.5148233 0.4851767
## 2 788 846 0.5100903 0.4899097
## 3 499 535 0.4711594 0.5288406
## 4 506 466 0.4798733 0.5201267
## 5 541 653 0.5032744 0.4967256
## 6 363 477 0.4352844 0.5647156
## prop_poverty
## 1 0.1030618
## 2 0.1192494
## 3 0.2474965
## 4 0.1189653
## 5 0.1095393
## 6 0.3227418
Exploratory Data Analysis (EDA)
# Data skim
skim(census_data)
| Name | census_data |
| Number of rows | 41867 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| county_state | 0 | 1 | 16 | 42 | 0 | 3228 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| geoid | 0 | 1 | 31394.62 | 16291.12 | 1001.00 | 19032.00 | 30025.00 | 46107.00 | 72153.00 | ▅▇▆▆▁ |
| year | 0 | 1 | 2015.00 | 3.74 | 2009.00 | 2012.00 | 2015.00 | 2018.00 | 2021.00 | ▇▅▇▅▇ |
| population | 0 | 1 | 99198.02 | 318133.15 | 41.00 | 11218.00 | 25987.00 | 66375.50 | 10105722.00 | ▇▁▁▁▁ |
| median_income | 4 | 1 | 48034.94 | 14258.33 | 10499.00 | 39167.00 | 46215.00 | 54732.00 | 156821.00 | ▃▇▁▁▁ |
| median_monthly_rent_cost | 30 | 1 | 703.80 | 214.98 | 99.00 | 567.00 | 658.00 | 784.00 | 2599.00 | ▅▇▁▁▁ |
| median_monthly_home_cost | 18 | 1 | 775.03 | 356.61 | 103.00 | 543.00 | 684.00 | 910.00 | 3254.00 | ▇▅▁▁▁ |
| prop_female | 0 | 1 | 0.50 | 0.02 | 0.19 | 0.49 | 0.50 | 0.51 | 0.63 | ▁▁▁▇▁ |
| prop_male | 0 | 1 | 0.50 | 0.02 | 0.37 | 0.49 | 0.50 | 0.51 | 0.81 | ▁▇▁▁▁ |
| prop_poverty | 1 | 1 | 0.17 | 0.08 | 0.00 | 0.11 | 0.15 | 0.20 | 0.67 | ▆▇▁▁▁ |
# Display data structure
str(census_data)
## 'data.frame': 41867 obs. of 10 variables:
## $ geoid : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ county_state : chr "Autauga County, Alabama" "Baldwin County, Alabama" "Barbour County, Alabama" "Bibb County, Alabama" ...
## $ year : int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
## $ population : int 49584 171997 29663 21464 56804 10917 20189 112969 34704 24427 ...
## $ median_income : int 51463 48918 32537 41236 45406 26209 31938 38382 32433 40240 ...
## $ median_monthly_rent_cost: int 779 788 499 506 541 363 453 571 552 450 ...
## $ median_monthly_home_cost: int 854 846 535 466 653 477 528 628 556 484 ...
## $ prop_female : num 0.515 0.51 0.471 0.48 0.503 ...
## $ prop_male : num 0.485 0.49 0.529 0.52 0.497 ...
## $ prop_poverty : num 0.103 0.119 0.247 0.119 0.11 ...
# Summary of important variables
summary(census_data)
## geoid county_state year population
## Min. : 1001 Length:41867 Min. :2009 Min. : 41
## 1st Qu.:19032 Class :character 1st Qu.:2012 1st Qu.: 11218
## Median :30025 Mode :character Median :2015 Median : 25987
## Mean :31395 Mean :2015 Mean : 99198
## 3rd Qu.:46107 3rd Qu.:2018 3rd Qu.: 66376
## Max. :72153 Max. :2021 Max. :10105722
##
## median_income median_monthly_rent_cost median_monthly_home_cost
## Min. : 10499 Min. : 99.0 Min. : 103
## 1st Qu.: 39167 1st Qu.: 567.0 1st Qu.: 543
## Median : 46215 Median : 658.0 Median : 684
## Mean : 48035 Mean : 703.8 Mean : 775
## 3rd Qu.: 54732 3rd Qu.: 784.0 3rd Qu.: 910
## Max. :156821 Max. :2599.0 Max. :3254
## NA's :4 NA's :30 NA's :18
## prop_female prop_male prop_poverty
## Min. :0.1917 Min. :0.3736 Min. :0.0000
## 1st Qu.:0.4950 1st Qu.:0.4882 1st Qu.:0.1129
## Median :0.5044 Median :0.4956 Median :0.1515
## Mean :0.5000 Mean :0.5000 Mean :0.1659
## 3rd Qu.:0.5118 3rd Qu.:0.5050 3rd Qu.:0.1973
## Max. :0.6264 Max. :0.8083 Max. :0.6707
## NA's :1
# Missing values
gg_miss_var(census_data)
String Manipulation
# Manipulate strings in the dataset and append new columns with the changes
census_data <- census_data |>
mutate(
new_county_state = str_to_title(county_state),
new_county_state = str_replace_all(new_county_state, " County", ""),
year = str_trim(as.character(year))
)
# Display the first few rows of the updated dataset
head(census_data)
## geoid county_state year population median_income
## 1 1001 Autauga County, Alabama 2009 49584 51463
## 2 1003 Baldwin County, Alabama 2009 171997 48918
## 3 1005 Barbour County, Alabama 2009 29663 32537
## 4 1007 Bibb County, Alabama 2009 21464 41236
## 5 1009 Blount County, Alabama 2009 56804 45406
## 6 1011 Bullock County, Alabama 2009 10917 26209
## median_monthly_rent_cost median_monthly_home_cost prop_female prop_male
## 1 779 854 0.5148233 0.4851767
## 2 788 846 0.5100903 0.4899097
## 3 499 535 0.4711594 0.5288406
## 4 506 466 0.4798733 0.5201267
## 5 541 653 0.5032744 0.4967256
## 6 363 477 0.4352844 0.5647156
## prop_poverty new_county_state
## 1 0.1030618 Autauga, Alabama
## 2 0.1192494 Baldwin, Alabama
## 3 0.2474965 Barbour, Alabama
## 4 0.1189653 Bibb, Alabama
## 5 0.1095393 Blount, Alabama
## 6 0.3227418 Bullock, Alabama
Data Cleaning
census_data_sample <- census_data|>
filter(!is.na(median_income))|>
select(new_county_state, year, population, median_income, prop_female, prop_male, prop_poverty, median_monthly_rent_cost, median_monthly_home_cost)
#cleaned and sampled data
head(census_data_sample)
## new_county_state year population median_income prop_female prop_male
## 1 Autauga, Alabama 2009 49584 51463 0.5148233 0.4851767
## 2 Baldwin, Alabama 2009 171997 48918 0.5100903 0.4899097
## 3 Barbour, Alabama 2009 29663 32537 0.4711594 0.5288406
## 4 Bibb, Alabama 2009 21464 41236 0.4798733 0.5201267
## 5 Blount, Alabama 2009 56804 45406 0.5032744 0.4967256
## 6 Bullock, Alabama 2009 10917 26209 0.4352844 0.5647156
## prop_poverty median_monthly_rent_cost median_monthly_home_cost
## 1 0.1030618 779 854
## 2 0.1192494 788 846
## 3 0.2474965 499 535
## 4 0.1189653 506 466
## 5 0.1095393 541 653
## 6 0.3227418 363 477
Summary statistics Create a Sample of the Data
# Sample of around 500 data observations
census_data_sample <- census_data |> sample_frac(0.012)
# Check the sample size
nrow(census_data_sample)
## [1] 502
Group-level Summary Statistics
# Summary statistics by new_county_state
state_summary <- census_data |>
group_by(new_county_state) |>
summarize(
avg_income = round(mean(median_income, na.rm = TRUE), 0),
avg_population = round(mean(population, na.rm = TRUE), 0)
) |>
rename(
`County, State` = new_county_state,
`Income` = avg_income,
`Population` = avg_population
) |>
arrange(`Income`)
# Convert income and population formats
state_summary <- state_summary |>
mutate(
Income = dollar(Income),
Population = comma(Population)
)
# Display the first 50 rows of the summary table
knitr::kable(head(state_summary, 50), caption = "Average County-level Income Statistics (Lowest 50)")
| County, State | Income | Population |
|---|---|---|
| Adjuntas Municipio, Puerto Rico | $12,481 | 18,731 |
| Guánica Municipio, Puerto Rico | $12,905 | 18,152 |
| Comerío Municipio, Puerto Rico | $13,230 | 19,999 |
| Maricao Municipio, Puerto Rico | $13,334 | 6,206 |
| Lares Municipio, Puerto Rico | $13,500 | 29,208 |
| Lajas Municipio, Puerto Rico | $14,048 | 24,677 |
| San Sebastián Municipio, Puerto Rico | $14,262 | 40,650 |
| Orocovis Municipio, Puerto Rico | $14,427 | 22,519 |
| Las Marías Municipio, Puerto Rico | $14,440 | 9,431 |
| Moca Municipio, Puerto Rico | $14,455 | 38,830 |
| Ciales Municipio, Puerto Rico | $14,604 | 17,970 |
| Mayagüez Municipio, Puerto Rico | $14,639 | 83,133 |
| Yauco Municipio, Puerto Rico | $14,804 | 39,533 |
| Patillas Municipio, Puerto Rico | $15,018 | 18,261 |
| Utuado Municipio, Puerto Rico | $15,094 | 31,281 |
| Salinas Municipio, Puerto Rico | $15,309 | 29,734 |
| Quebradillas Municipio, Puerto Rico | $15,410 | 25,097 |
| Isabela Municipio, Puerto Rico | $15,426 | 44,214 |
| San Germán Municipio, Puerto Rico | $15,481 | 33,967 |
| Corozal Municipio, Puerto Rico | $15,493 | 35,795 |
| Guayanilla Municipio, Puerto Rico | $15,722 | 20,362 |
| Barranquitas Municipio, Puerto Rico | $15,831 | 29,562 |
| Aguada Municipio, Puerto Rico | $15,875 | 40,533 |
| Arroyo Municipio, Puerto Rico | $15,895 | 18,642 |
| Aguadilla Municipio, Puerto Rico | $15,978 | 58,050 |
| Peñuelas Municipio, Puerto Rico | $16,077 | 22,958 |
| Sabana Grande Municipio, Puerto Rico | $16,243 | 24,302 |
| Vieques Municipio, Puerto Rico | $16,261 | 9,000 |
| Jayuya Municipio, Puerto Rico | $16,285 | 15,879 |
| Barceloneta Municipio, Puerto Rico | $16,308 | 24,309 |
| Aguas Buenas Municipio, Puerto Rico | $16,407 | 27,516 |
| Yabucoa Municipio, Puerto Rico | $16,627 | 35,973 |
| Cabo Rojo Municipio, Puerto Rico | $16,646 | 49,976 |
| Guayama Municipio, Puerto Rico | $16,889 | 43,077 |
| Ponce Municipio, Puerto Rico | $17,009 | 155,669 |
| Morovis Municipio, Puerto Rico | $17,169 | 31,797 |
| Naranjito Municipio, Puerto Rico | $17,221 | 29,495 |
| Arecibo Municipio, Puerto Rico | $17,435 | 92,297 |
| Camuy Municipio, Puerto Rico | $17,500 | 34,032 |
| Florida Municipio, Puerto Rico | $17,521 | 12,505 |
| Santa Isabel Municipio, Puerto Rico | $17,739 | 22,514 |
| Coamo Municipio, Puerto Rico | $17,744 | 39,545 |
| Añasco Municipio, Puerto Rico | $17,767 | 28,250 |
| Aibonito Municipio, Puerto Rico | $17,818 | 24,827 |
| Naguabo Municipio, Puerto Rico | $17,846 | 26,138 |
| Villalba Municipio, Puerto Rico | $17,924 | 24,759 |
| Vega Baja Municipio, Puerto Rico | $17,929 | 56,910 |
| San Lorenzo Municipio, Puerto Rico | $17,981 | 39,697 |
| Manatí Municipio, Puerto Rico | $18,082 | 42,258 |
| Hatillo Municipio, Puerto Rico | $18,123 | 41,137 |
Frequency Table of Two Categorical Variables
# Frequency table of county_state and year with average median_monthly_rent_cost
state_year_rent_table <- census_data_sample |>
group_by(new_county_state, year) |>
summarize(
avg_median_rent = round(mean(median_monthly_rent_cost, na.rm = TRUE), 0)
) |>
ungroup() |>
select(`County, State` = new_county_state, Year = year, `Rent` = avg_median_rent) |>
arrange(desc(Rent)) |>
mutate(Rent = dollar(Rent))
## `summarise()` has grouped output by 'new_county_state'. You can override using
## the `.groups` argument.
# Display the first 50 rows of the frequency table
knitr::kable(head(state_year_rent_table, 50), caption = "Frequency Table of County, Year and Average Median Rent \n(Top 50)")
| County, State | Year | Rent |
|---|---|---|
| Arlington, Virginia | 2020 | $2,005 |
| San Francisco, California | 2017 | $1,709 |
| Charles, Maryland | 2018 | $1,661 |
| Queens, New York | 2017 | $1,456 |
| Sacramento, California | 2021 | $1,434 |
| Placer, California | 2016 | $1,339 |
| Ouray, Colorado | 2020 | $1,305 |
| Cherokee, Georgia | 2020 | $1,304 |
| Routt, Colorado | 2019 | $1,282 |
| Broward, Florida | 2017 | $1,271 |
| Northwest Arctic Borough, Alaska | 2021 | $1,266 |
| Los Angeles, California | 2016 | $1,264 |
| Williamson, Texas | 2018 | $1,248 |
| Hays, Texas | 2021 | $1,235 |
| Manassas City, Virginia | 2010 | $1,232 |
| Fulton, Georgia | 2019 | $1,205 |
| Morris, New Jersey | 2009 | $1,199 |
| Rockwall, Texas | 2009 | $1,183 |
| Davidson, Tennessee | 2020 | $1,172 |
| Midland, Texas | 2016 | $1,148 |
| Rockwall, Texas | 2011 | $1,142 |
| Essex, Massachusetts | 2017 | $1,135 |
| Palm Beach, Florida | 2010 | $1,129 |
| Ramsey, Minnesota | 2021 | $1,120 |
| St. Johns, Florida | 2015 | $1,119 |
| Union, New Jersey | 2011 | $1,119 |
| Portsmouth City, Virginia | 2021 | $1,116 |
| Will, Illinois | 2017 | $1,112 |
| Kent, Rhode Island | 2020 | $1,110 |
| Delaware, Pennsylvania | 2020 | $1,109 |
| Goochland, Virginia | 2017 | $1,096 |
| Williamsburg City, Virginia | 2015 | $1,093 |
| Chesapeake City, Virginia | 2011 | $1,090 |
| Chesterfield, Virginia | 2013 | $1,077 |
| Hamilton, Indiana | 2016 | $1,071 |
| James City, Virginia | 2010 | $1,066 |
| Essex, Massachusetts | 2014 | $1,063 |
| Anchorage Municipality, Alaska | 2011 | $1,058 |
| Dare, North Carolina | 2017 | $1,056 |
| Canadian, Oklahoma | 2021 | $1,054 |
| Chesterfield, Virginia | 2012 | $1,052 |
| Baltimore City, Maryland | 2018 | $1,051 |
| Hillsborough, Florida | 2017 | $1,040 |
| Belknap, New Hampshire | 2020 | $1,039 |
| Warren, Virginia | 2019 | $1,020 |
| Sarasota, Florida | 2014 | $1,008 |
| St. Tammany Parish, Louisiana | 2016 | $998 |
| Pinal, Arizona | 2016 | $990 |
| Jefferson, New York | 2019 | $987 |
| Hanover, Virginia | 2010 | $977 |
Data Visualizations Visualization 1: Lowest Average Population
# Filter for the 10 counties with the lowest average populations over the years
lowest_10_population_counties <- census_data_sample |>
group_by(new_county_state) |>
summarize(avg_population = mean(population, na.rm = TRUE)) |>
top_n(-10, avg_population)
# Filter the original data to include only these counties
lowest_10_population_data <- census_data_sample |>
filter(new_county_state %in% lowest_10_population_counties$new_county_state)
# Summarize the average population for each county
lowest_10_population_summary <- lowest_10_population_data |>
group_by(new_county_state) |>
summarize(avg_population = mean(population, na.rm = TRUE))
# Create a horizontal bar chart
ggplot(lowest_10_population_summary, aes(x = reorder(new_county_state, avg_population), y = avg_population)) +
geom_bar(stat = "identity", fill = "lightblue", color = "black") +
coord_flip() +
labs(title = "10 Counties with the \nLowest Average Population", x = "County", y = "Population",
caption = "Source: Census Data Sample") +
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))+
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
)
Visualization 2: Median Income Distribution
# Filter for the 10 counties with the highest median incomes
top_10_income_counties <- census_data |>
group_by(new_county_state) |>
summarize(median_income = mean(median_income, na.rm = TRUE)) |>
top_n(10, median_income)
# Filter the original data to include only these counties
top_10_income_data <- census_data |>
filter(new_county_state %in% top_10_income_counties$new_county_state)
# Median income distribution by top 10 counties using census data
ggplot(top_10_income_data, aes(x = new_county_state, y = round(median_income, 0))) +
geom_boxplot(fill = "yellow", color = "black") +
scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "", big.mark = ","), limits = c(80000, 160000), breaks = seq(0, 160000, by = 20000)) +
labs(
title = "Median Income Distribution in the \n(Top 10 Counties)",
x = "County",
y = "Income",
caption = "Source: Census Data") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
) +
coord_flip()
Visualization 3: Population of Females by Top 10 States
# Filter for the 10 states with the highest average female population
top_10_female_population_states <- census_data |>
group_by(new_county_state) |>
summarize(avg_prop_female = mean(prop_female, na.rm = TRUE)) |>
top_n(10, avg_prop_female)
# Filter the original data to include only these states
top_10_female_population_data <- census_data |>
filter(new_county_state %in% top_10_female_population_states$new_county_state)
# Create a heatmap of the proportion of females by top 10 states using sample data
ggplot(top_10_female_population_data, aes(x = new_county_state, y = year, fill = round(prop_female * 100, 0))) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Proportion of Females (%)") +
labs(
title = "Population of Females by Top 10 States",
x = "State",
y = "Year",
caption = "Source: Census Data"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
axis.text.y = element_text(size = 6),
legend.text = element_text(size = 6),
legend.title = element_text(size = 6),
plot.title = element_text(size = 10),
plot.caption = element_text(size = 8)
)
Interpretation of the data: We expect that in Norton City, Virginia, in the year 2014, the population of females reached 63%, making it the highest among the top 10 states with the highest average female population. This suggests that you would most likely find a bride in Norton City, Virginia, during that year based on the highest female population proportion.
Population of Males by Top 10 States
# Filter for the 10 states with the highest average male population
top_10_male_population_states <- census_data |>
group_by(new_county_state) |>
summarize(avg_prop_male = mean(prop_male, na.rm = TRUE)) |>
top_n(10, avg_prop_male)
# Filter the original data to include only these states
top_10_male_population_data <- census_data |>
filter(new_county_state %in% top_10_male_population_states$new_county_state)
# Create a heatmap of the proportion of males by top 10 states using sample data
ggplot(top_10_male_population_data, aes(x = new_county_state, y = year, fill = round(prop_male * 100, 0))) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Proportion of Males (%)") +
labs(title = "Population of Males by Top 10 States ", x = "State", y = "Year",
caption = "Source: Cenus Data") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
axis.text.y = element_text(size = 6),
legend.text = element_text(size = 6),
legend.title = element_text(size = 6),
plot.title = element_text(size = 10),
plot.caption = element_text(size = 8)
)
Interpretation of the data: In 2017, Forest, Pennsylvania had the highest population of males, reaching up to 80%. This makes it the state and year where you would most likely find a husband based on the highest male population.
Visualization 4: Population in Poverty Trend
lowest_10_poverty_counties <- census_data_sample |>
group_by(new_county_state) |>
summarize(avg_prop_poverty = mean(prop_poverty, na.rm = TRUE)) |>
top_n(-10, avg_prop_poverty)
# Filter the original data to include only these counties
lowest_10_poverty_data <- census_data_sample |>
filter(new_county_state %in% lowest_10_poverty_counties$new_county_state)
# Plot the trend of poverty rates over time for the lowest 10 counties
ggplot(lowest_10_poverty_data, aes(x = year, y = prop_poverty * 100, color = new_county_state, group = new_county_state)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(
title = "Trend of Poverty Rates Over Time for the \n10 Counties with the Lowest Average Poverty Rates",
x = "Year",
y = "Population in Poverty",
color = "County",
caption = "Source: Census Data Sample"
)+
theme(axis.text.y = element_text(size = 7),
axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Visualization 5: Interactive comparison of Median income level with Median Rent and Mortgage Costs
# Convert 'year' to numeric
census_data <- census_data |> mutate(year = as.numeric(year))
# Summary of median income, rent cost, and home cost
summary_data <- census_data |>
group_by(year) |>
summarise(
mean_income = round(mean(median_income, na.rm = TRUE), 2),
mean_rent_cost = round(mean(median_monthly_rent_cost, na.rm = TRUE), 2),
mean_home_cost = round(mean(median_monthly_home_cost, na.rm = TRUE), 2)
)
# Melt the data for faceting
summary_data_long <- summary_data |>
pivot_longer(cols = c(mean_income, mean_rent_cost, mean_home_cost),
names_to = "variable",
values_to = "value")
# Recode the variable names
summary_data_long <- summary_data_long |>
mutate(variable = recode(variable,
mean_income = "Income",
mean_rent_cost = "Rent",
mean_home_cost = "Mortgage"))
# Plot the trends
comparison_plot <- ggplot(summary_data_long, aes(x = year, y = value, color = variable)) +
geom_line(size = 1.2) +
scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "", big.mark = ",", accuracy = 0.01)) +
facet_wrap(~ variable, scales = "free_y", ncol = 1) +
labs(
title = 'Comparison of Income, Rent and Mortgage Cost \n(2009-2021)',
x = 'Year',
y = 'Amount',
caption = "Data Source: United States Census Data",
color = "Variable"
)
# Convert ggplot to plotly
ggplotly(comparison_plot)
Beeswarm Plot
# Filter counties with median income between 120,000 and 160,000
income_filtered <- census_data|>
filter(median_income >= 120000 & median_income <= 160000)
# beeswarm plot
ggplot(income_filtered, aes(x = reorder(new_county_state, -median_income), y = median_income)) +
geom_quasirandom(alpha = 0.6, color = "blue") +
labs(title = "Beeswarm Plot of Median Income by County\n(120,000 and 160,000)",
x = "County",
y = "Income",
caption = "Source: Cenus Data") +
scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "", big.mark = ","), breaks = seq(120000, 160000, by = 15000)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
) +
guides(color=guide_legend(title="County"))
Permutation Testing
# Permutation test for difference in median income between two counties
county1 <- "Antrim County, Michigan"
county2 <- "Wayne County, Michigan"
# Define the function to compute income difference
income_diff <- function(data, county1, county2) {
mean(data[data$county_state == county1, ]$median_income, na.rm = TRUE) -
mean(data[data$county_state == county2, ]$median_income, na.rm = TRUE)
}
# Calculate the observed difference
observed_diff <- income_diff(census_data, county1, county2)
# Generate permuted differences
permuted_diffs <- replicate(1000, {
permuted_data <- census_data
permuted_data$county_state <- sample(permuted_data$county_state)
income_diff(permuted_data, county1, county2)
})
# Convert the permuted differences to a data frame
permuted_diffs_df <- data.frame(permuted_diffs = permuted_diffs)
# Plot the permuted differences
ggplot(permuted_diffs_df, aes(x = permuted_diffs)) +
geom_histogram(binwidth = 500, fill = "blue", alpha = 0.7, color = "black") +
geom_vline(aes(xintercept = observed_diff), color = "red", linetype = "dashed", size = 1) +
labs(title = "Permutation Test for Median Income Difference", x = "Income Difference", y = "Frequency",
caption = "Source: Cenus Data") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
)
Bootstrap
# Set seed for reproducibility
set.seed(1971)
# Number of bootstrap samples
B <- 1000
# Instantiating matrix for bootstrap samples
bootstrap_samples <- matrix(NA, nrow = nrow(census_data_sample), ncol = B)
# Sampling with replacement B times
for (b in 1:B) {
bootstrap_samples[, b] <- census_data_sample |>
slice_sample(prop = 1, replace = TRUE) |>
dplyr::pull(median_income)
}
# Calculate bootstrap medians
bootstrap_medians <- apply(bootstrap_samples, 2, median, na.rm = TRUE)
# Calculate bootstrap confidence interval
boot_ci <- quantile(bootstrap_medians, probs = c(0.025, 0.975))
# Create a histogram of the bootstrap medians
ggplot(data.frame(medians = bootstrap_medians), aes(x = round(medians, 0))) +
geom_histogram(binwidth = 500) +
geom_vline(aes(xintercept = round(boot_ci[1], 0)), color = "red") +
geom_vline(aes(xintercept = round(boot_ci[2], 0)), color = "red") +
labs(
title = "Bootstrap Confidence Interval for Median Income",
x = "Income",
y = "Frequency",
caption = "Source: Census Data Sample"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
)
Data Dictionary
# Display data dictionary
data_dictionary <- tibble(
variable = c("geoid", "county_state", "year", "population", "median_income",
"median_monthly_rent_cost", "median_monthly_home_cost", "prop_female",
"prop_male", "prop_poverty"),
description = c("Geographic region ID", "Geographic region", "Year", "Population",
"Median income in dollars", "Median monthly housing costs for homeowners in dollars",
"Median monthly rent costs for renters in dollars", "Proportion of people who are female",
"Proportion of people who are male", "Proportion of people living in poverty")
)
knitr::kable(data_dictionary, caption = "Data Dictionary for Census Data")
| variable | description |
|---|---|
| geoid | Geographic region ID |
| county_state | Geographic region |
| year | Year |
| population | Population |
| median_income | Median income in dollars |
| median_monthly_rent_cost | Median monthly housing costs for homeowners in dollars |
| median_monthly_home_cost | Median monthly rent costs for renters in dollars |
| prop_female | Proportion of people who are female |
| prop_male | Proportion of people who are male |
| prop_poverty | Proportion of people living in poverty |
Extra Credit Population Trends
# Filter data for a sample county
sample_county <- census_data |> dplyr::filter(new_county_state == 'Autauga, Alabama')
# Check the structure of sample_county to ensure it's correct
str(sample_county)
## 'data.frame': 13 obs. of 11 variables:
## $ geoid : int 1001 1001 1001 1001 1001 1001 1001 1001 1001 1001 ...
## $ county_state : chr "Autauga County, Alabama" "Autauga County, Alabama" "Autauga County, Alabama" "Autauga County, Alabama" ...
## $ year : num 2009 2010 2011 2012 2013 ...
## $ population : int 49584 53155 53944 54590 54907 55136 55221 55049 55036 55200 ...
## $ median_income : int 51463 53255 53899 53773 53682 52475 51281 53099 55317 58786 ...
## $ median_monthly_rent_cost: int 779 769 832 836 851 876 883 896 932 966 ...
## $ median_monthly_home_cost: int 854 889 912 918 887 875 878 872 853 877 ...
## $ prop_female : num 0.515 0.515 0.515 0.514 0.512 ...
## $ prop_male : num 0.485 0.485 0.485 0.486 0.488 ...
## $ prop_poverty : num 0.103 0.106 0.109 0.116 0.121 ...
## $ new_county_state : chr "Autauga, Alabama" "Autauga, Alabama" "Autauga, Alabama" "Autauga, Alabama" ...
# Plot population trend
plot <- ggplot(sample_county, aes(x = year, y = population)) +
geom_line(size = 1.2, color = "blue") +
scale_y_continuous(labels = scales::comma) +
labs(
title = 'Population Trend in Autauga County, Alabama',
x = 'Year',
y = 'Population',
caption = "Data Source: Census Data"
)
# Convert ggplot to plotly
ggplotly(plot)
Median Income Summary
# Convert 'year' back to numeric
census_data <- census_data |> mutate(year = as.numeric(year))
# Summary of median income
income_summary <- census_data |>
group_by(year) |>
summarise(mean_income = mean(median_income, na.rm = TRUE))
# Check the summary
print(income_summary)
## # A tibble: 13 × 2
## year mean_income
## <dbl> <dbl>
## 1 2009 42818.
## 2 2010 43613.
## 3 2011 44620.
## 4 2012 44973.
## 5 2013 45265.
## 6 2014 45857.
## 7 2015 46130.
## 8 2016 47252.
## 9 2017 48995.
## 10 2018 50792.
## 11 2019 52648.
## 12 2020 54172.
## 13 2021 57327.
# Plot median income trend
income_plot <- ggplot(income_summary, aes(x = year, y = mean_income)) +
geom_line(size = 1.2, color="blue") +
scale_y_continuous(labels = scales::comma) +
labs(
title = 'Median Income Trend (2009-2021)',
x = 'Year',
y = 'Median Income (in USD)',
caption = "Data Source: United States Census Data"
)
# Convert ggplot to plotly
ggplotly(income_plot)
Median Monthly Rent Cost Summary
# Summary of median monthly rent cost
rent_summary <- census_data |>
group_by(year) |>
summarise(mean_rent = mean(median_monthly_rent_cost, na.rm = TRUE))
# Plot median monthly rent cost trend
rent_plot <- ggplot(rent_summary, aes(x = year, y = mean_rent)) +
geom_line(size = 1.2, color="blue") +
labs(
title = 'Median Monthly Rent Cost Trend (2009-2021)',
x = 'Year',
y = 'Monthly Rent',
caption = "Data Source: Census Data")
# Convert ggplot to plotly
ggplotly(rent_plot)
Key Summary Table
# Summary table of key statistics
key_summary <- census_data |>
group_by(year) |>
summarise(
mean_population = mean(population, na.rm = TRUE),
mean_income = mean(median_income, na.rm = TRUE),
mean_rent = mean(median_monthly_rent_cost, na.rm = TRUE),
mean_home_cost = mean(median_monthly_home_cost, na.rm = TRUE)
) |>
mutate(
mean_population = round(mean_population),
mean_income = round(mean_income),
mean_rent = round(mean_rent),
mean_home_cost = round(mean_home_cost)
)
# Display the summary table
datatable(key_summary, options = list(pageLength = 10)) |>
formatCurrency(columns = c('mean_income', 'mean_rent', 'mean_home_cost'), currency = '', interval = 3, mark = ',') |>
formatRound(columns = 'mean_population', interval = 3, mark = ',', digits = 0)
Key Findings:
Income Distribution: Higher-income counties tend to be located in more economically developed areas, while lower-income counties are often found in less developed regions.
Rent and Mortgage Trends: Rent and mortgage costs have generally increased over time, reflecting economic inflation and housing market trends.
Population Dynamics: Certain counties show significant gender imbalances, which could be due to various socio-economic factors.
Poverty Rates: Counties with the lowest poverty rates consistently show a decline, indicating economic improvements.
The project offers a thorough analysis and visual representations that provide valuable insights into socio-economic trends across various counties in the United States during the period under examination.